/* =============================================================================
EU - IPUMS data on migration, educ, hh size, noncitizens, age, labor

1) Harmonized IPUMS variables (AT, FR, CH, ES)
      a) Female employment 
      b) Education
      c) Household Size
      d) Home ownership

Source:
Minnesota Population Center. Integrated Public Use Microdata Series, 
International: Version 6.1 [Machine-readable database]. 
Minneapolis: University of Minnesota, 2011

Censuses for the following countries: 
AT 1971
FR 1968, 1975 
CH 1970
ES 1981 
============================================================================= */      

* ------------------------------------------------------------------------------
* Load the IPUMS extract, keep the four study countries, and store a working
* copy that later sections reload.
* ------------------------------------------------------------------------------
clear
set more off

cd "$scratch/IPUMS International Data"
* The raw extract is stored compressed; unpack it before it is read.
! uncompress ipumsi_00012.dat.Z

* Presumably the IPUMS-generated script that reads ipumsi_00012.dat into
* memory -- confirm against the extract.
quietly do ipumsi_00012.do

* Keep only the four cntry codes that are given a country label below.
keep if inlist(cntry, 40, 250, 756, 724)

gen country = ""
replace country = "AT" if cntry==40
replace country = "FR" if cntry==250
replace country = "CH" if cntry==756
replace country = "ES" if cntry==724

drop if country=="FR" & year==1975 //use 1968 census since it is closer to 1970

* $scratch is written without a leading slash, matching the cd above.
* Assumes $scratch holds an absolute path -- confirm.
save "$scratch/all_countries_temp.dta", replace

* ------------------------------------------------------------------------------
* Build string NUTS3 / NUTS2 / NUTS1 codes for AT, CH and FR.
* ES is dropped here; it is reloaded from all_countries_temp.dta in section 3.
* ------------------------------------------------------------------------------
drop if country=="ES"

* NUTS3: country prefix + characters taken from the numeric IPUMS code.
* substr(s, start, length): the third argument is a length, not an end position.
* No nuts3 is built for FR, so it stays empty for French records.
tostring enuts3, replace force
gen nuts3 = country + substr(enuts3,2,4) if country=="AT"
	replace nuts3="AT111&AT113" if nuts3=="AT111"
replace nuts3 = country + substr(enuts3,3,5) if country=="CH"

*Composite NUTS in IPUMS
* The variable is named in full: no variable called "nuts" exists at this point,
* and relying on abbreviation would break once another nuts* variable exists
* or when variable abbreviation is switched off.
replace nuts3 = "CH053&CH054" if nuts3=="CH053"

* NUTS2: same construction; FR is covered at this level.
tostring enuts2, replace force
gen nuts2 = country + substr(enuts2,2,3) if country=="AT"
	replace nuts2 = country + substr(enuts2,3,4)  if country=="CH"|country=="FR"

* NUTS1 is the first three characters of the NUTS2 code.
gen nuts1 =substr(nuts2,1,3)
	
*a) Employment
* Person-level 0/1-style flags (1 or missing) that are summed later.
gen EMP = 1 if eempsta==110 //Employed
gen EMP_female = 1 if sex==2 & eempsta==110

*b) Education
* From the unharmonized Switzerland education variable it is possible to
* separate out some of those still in school from unknown.
replace school = 1 if ch70a_edatt2==9
gen inschool = 1 if school==1

*years of school: one indicator per number of years, 0 through 21
forvalues yrs = 0/21 {
	gen edatt_yrs_`yrs' = .
}

*1) Austria (AT): educat code -> years of schooling, ages 15+ and not in school
local at_codes 10 23 21 24 22 30 40
local at_years  8 11 12 12 13 14 16
forvalues i = 1/7 {
	local code : word `i' of `at_codes'
	local yrs  : word `i' of `at_years'
	replace edatt_yrs_`yrs' = 1 if educat == `code' & age>=15 & school !=1
}

* CH added to step 3 due to unknown values

*2) France Note: universe in original variable ages 17+
*   educfr code -> years of schooling, ages 17+ and not in school
local fr_codes 0 10 20 30 40 50
local fr_years 5  5  9 11 12 17
forvalues i = 1/6 {
	local code : word `i' of `fr_codes'
	local yrs  : word `i' of `fr_years'
	replace edatt_yrs_`yrs' = 1 if educfr == `code' & age>=17 & school !=1
}

* Source variables are no longer needed once the indicators exist.
drop age edattan edattand educ*

*c) Household size

* Number the persons inside each household so one record per household can be
* picked. serial identifies a household only within a single census sample, so
* the counter is built within country and year; grouping on serial alone would
* merge unrelated households from different countries that share a serial.
bysort country year serial: gen n = _n
gen hh_size = persons if gq==10 & n==1 //one observation for each private household

*d) Home ownership
* One record per private household (gq==10, n==1) carries wthh/wtper, so that a
* later sum weighted by wtper adds up the household weight wthh.
* NOTE(review): ownrshp codes 1 / 2 / 9 are read as owned / not owned / unknown
* from the variable names below -- confirm against the codebook.

gen home_own = 1/wtper*wthh if ownrshp==1 &  gq==10 & n==1
gen home_no_own = 1/wtper*wthh if ownrshp ==2 &  gq==10 & n==1
gen home_unknown_own = 1/wtper*wthh if ownrshp ==9 & gq==10 & n==1

* ------------------------------------------------------------------------------
* Collapse the person records to NUTS3, NUTS2, NUTS1 and country totals, one
* country at a time, and stack the results into one dataset keyed by "nuts".
* Temporary files are referenced by their declared macro names, in quotes.
* ------------------------------------------------------------------------------
tempfile all_countries_precollapse
save "`all_countries_precollapse'"

*separate out collapse by country to speed up collapse
local countries "AT CH FR"

foreach country of local countries {

	display "`country'"

	* --- NUTS3 and NUTS2: identical steps, run once per level ---
	foreach level in 3 2 {

		use EMP* edatt* home* hh_size year country wtper nuts`level' ///
			if country=="`country'" using "`all_countries_precollapse'", clear

		drop if nuts`level'==""

		* A country with no codes at this level (no nuts3 is built for FR)
		* leaves zero observations and collapse stops with r(2000). That one
		* case is tolerated: the empty dataset is saved and appended below.
		* Any other collapse error is re-raised instead of being hidden.
		capture collapse (sum) EMP* edatt* home* (mean) hh_size (first) year ///
			country [fw=wtper], by(nuts`level') fast
		if _rc != 0 & _rc != 2000 {
			error _rc
		}

		rename nuts`level' nuts

		tempfile `country'_nuts`level'
		save "``country'_nuts`level''"
	}

	* --- NUTS1 (country is not carried through this collapse) ---
	use EMP* edatt* home* hh_size year country wtper nuts1 ///
		if country=="`country'" using "`all_countries_precollapse'", clear

	collapse (sum) EMP* edatt* home* (mean) hh_size (first) year ///
		[fw=wtper], by(nuts1) fast

	rename nuts1 nuts

	tempfile `country'_nuts1
	save "``country'_nuts1'"

	* --- Country total: the country code itself becomes the nuts key ---
	use EMP* edatt* home* hh_size year country wtper ///
		if country=="`country'" using "`all_countries_precollapse'", clear

	collapse (sum) EMP* edatt* home* (mean) hh_size (first) year ///
		[fw=wtper], by(country) fast

	rename country nuts

	append using "``country'_nuts1'"
	append using "``country'_nuts2'"
	append using "``country'_nuts3'"

	tempfile `country'_census
	save "``country'_census'"

}

* Stack the three per-country files.
use "`AT_census'", clear
	append using "`CH_census'"
	append using "`FR_census'"

* CH has missing values for education. Below I calculate shares for those with
* education data not missing and multiply by POP initial conditions for ages 15+,
* so the education counts computed here are blanked for every Swiss row.
* The country is read from the first two characters of nuts because the
* country variable is empty on the NUTS1 and country-level rows (those
* collapses do not carry it), which a test on country=="CH" would skip.
foreach var of varlist edatt* {

	replace `var' =. if substr(nuts,1,2)=="CH"

}

sort nuts
save "$scratch/IC_EU_IPUMS.dta", replace

*===============================================================================
*3) Estimate education level by 5 year age cohorts for CH & ES
*===============================================================================

* Reload only the CH and ES records from the working copy. The filter is placed
* before "using", which is the documented order for -use-, and $scratch is
* written without a leading slash (assumes $scratch is an absolute path).
use if country=="CH" | country=="ES" using "$scratch/all_countries_temp.dta", clear

* Reference year to which each person's age is shifted.
gen init_yr_age = .
	replace init_yr_age = 1970 if country=="CH"
	replace init_yr_age = 1971 if country=="ES"

* Age in the reference year = census age minus years elapsed since then.
gen age_initial = age - (year - init_yr_age)

* NOTE(review): missing values sort above every number in Stata, so a record
* with missing age_initial also passes this filter -- confirm none exist.
keep if age_initial >= 15

tostring enuts3, replace  force

* substr(s, start, length): the third argument is a length.
gen nuts3 = country + substr(enuts3,3,5) if country=="CH"
	replace nuts3= country+ substr(enuts3,2,4) if country=="ES"

*Composite Nuts in IPUMS.
* The variable is named in full rather than through the abbreviation "nuts".
	replace nuts3 = "CH053&CH054" if nuts3=="CH053"

* Five-year age groups on age_initial: 1 = 15-19, 2 = 20-24, ..., 11 = 65-69,
* 12 = 70 and over. Group 7 starts at 45, so that age 44 stays in group 6
* (40-44) instead of being overwritten by the next band.
* NOTE(review): a missing age_initial satisfies >=70 in Stata and would be
* placed in group 12 -- confirm no such records exist.
gen age_group = .
	replace age_group = 1 if age_initial>=15 & age_initial<=19
	replace age_group = 2 if age_initial>=20 & age_initial<=24
	replace age_group = 3 if age_initial>=25 & age_initial<=29
	replace age_group = 4 if age_initial>=30 & age_initial<=34
	replace age_group = 5 if age_initial>=35 & age_initial<=39
	replace age_group = 6 if age_initial>=40 & age_initial<=44
	replace age_group = 7 if age_initial>=45 & age_initial<=49
	replace age_group = 8 if age_initial>=50 & age_initial<=54
	replace age_group = 9 if age_initial>=55 & age_initial<=59
	replace age_group = 10 if age_initial>=60 & age_initial<=64
	replace age_group = 11 if age_initial>=65 & age_initial<=69
	replace age_group = 12 if age_initial>=70

*stocks of educational attainment
* Person-level indicators (1 or missing) that are summed by region and age
* group afterwards. Everyone flagged as in school is excluded from the
* attainment categories and counted in edatt_inschool instead.
replace school = 1 if ch70a_edatt2==9 //from unharmonized Switzerland variable, it is possible to separate out some of those still in school from unknown

gen edatt_inschool = 1 if school ==1 & age_initial>=15
gen edatt_less_than_primary = 1 if eedatta == 10 & age_initial>=15 & school !=1
* eedatta is written out in full on both sides of the range; the shortened
* name only worked through variable abbreviation.
gen edatt_primary = 1 if eedatta >= 20 & eedatta<=30 & age_initial>=15 & country!="CH" & school !=1
	replace edatt_primary = 1 if educch ==2 & age_initial>=15 & country=="CH" & school !=1  //There seems to be an error in the harmonized variable for CH where lower secondary completers are classified as completing upper secondary
gen edatt_secondary = 1 if eedatta >= 40 & eedatta <= 50  & age_initial>=15 & country!="CH" & school !=1
	replace edatt_secondary = 1 if educch>=3 & educch<=5 & age_initial>=15 & country=="CH"  & school !=1
gen edatt_university =1 if eedatta == 60 & age_initial>=15 & school !=1

*years of school: one indicator per number of years, 0 through 18
forval n=0/18 {
	gen edatt_yrs_`n' = .
}

*Switzerland (CH): educch code -> years of schooling
replace edatt_yrs_0 = 1 if educch == 1  & age_initial>=15 & school !=1
replace edatt_yrs_9 = 1 if educch == 2  & age_initial>=15 & school !=1
replace edatt_yrs_12 = 1 if educch == 3  & age_initial>=15 & school !=1
replace edatt_yrs_12 = 1 if educch == 4  & age_initial>=15 & school !=1
replace edatt_yrs_15 = 1 if educch == 5  & age_initial>=15 & school !=1
replace edatt_yrs_16 = 1 if educch == 6  & age_initial>=15 & school !=1

* Spain (ES): educes code -> years of schooling
replace edatt_yrs_0 = 1 if educes == 100  & age_initial>=15 & school !=1
replace edatt_yrs_3 = 1 if educes == 201  & age_initial>=15 & school !=1
replace edatt_yrs_5 = 1 if educes == 202  & age_initial>=15 & school !=1
replace edatt_yrs_8 = 1 if educes == 203  & age_initial>=15 & school !=1
replace edatt_yrs_12 = 1 if educes == 310  & age_initial>=15 & school !=1
replace edatt_yrs_13 = 1 if educes == 411  & age_initial>=15 & school !=1
replace edatt_yrs_14 = 1 if educes == 412  & age_initial>=15 & school !=1
replace edatt_yrs_14 = 1 if educes == 421  & age_initial>=15 & school !=1
replace edatt_yrs_16 = 1 if educes == 422  & age_initial>=15 & school !=1

* Source education variables are dropped so that edatt* below matches only
* the indicators created in this section.
drop edattan edattand educch educgr educes educpt

* ------------------------------------------------------------------------------
* Weighted counts of every edatt* indicator by age group at NUTS3, then rolled
* up to NUTS2, NUTS1 and country by truncating the region code, and finally
* stacked into one dataset keyed by nuts and age_group.
* Temporary files are referenced by their declared macro names, in quotes.
* ------------------------------------------------------------------------------

* Frequency weights must be integers, so the person weight is truncated.
gen weight= int(wtper)

collapse (sum) edatt* [fw= weight], by(age_group nuts3) //collapse edatt by age group

rename nuts3 nuts

tempfile ed_nuts3
save "`ed_nuts3'"

* NUTS2 = first four characters of the NUTS3 code
gen nuts2 = substr(nuts,1,4)

collapse (sum) edatt*, by(age_group nuts2)

rename nuts2 nuts

tempfile ed_nuts2
save "`ed_nuts2'"

* NUTS1 = first three characters
gen nuts1 = substr(nuts,1,3)

collapse (sum) edatt*, by(age_group nuts1)

rename nuts1 nuts

tempfile ed_nuts1
save "`ed_nuts1'"

* Country = first two characters
gen country=substr(nuts,1,2)

collapse (sum) edatt* , by(age_group country)

rename country nuts

append using "`ed_nuts1'"
append using "`ed_nuts2'"
append using "`ed_nuts3'"

* Rows without a region code are not needed in the wide file.
drop if nuts==""

* Turn every edatt* count into a share of the row total. The total is the sum
* of the five attainment categories (in school + four completed levels), with
* missing categories counted as zero.
egen total = rowtotal(edatt_inschool edatt_less_than_primary edatt_primary ///
		edatt_secondary edatt_university)

foreach share of varlist edatt* {
	replace `share' = `share'/total
}

drop total

* A trailing underscore separates the years-of-school number from the age-group
* number that reshape appends (edatt_yrs_9_3 rather than edatt_yrs_93).
forvalues yrs = 0/18 {
	rename edatt_yrs_`yrs' edatt_yrs_`yrs'_
}

* One row per region, one column per indicator and age group.
reshape wide edatt*, i(nuts) j(age_group)

* Attach the file IC_EU_AGE.dta by region and keep only regions found in both.
merge 1:1 nuts using  "$dta_files/IC_EU_AGE.dta"
	keep if _merge==3

* Replace the age-group number (1-12) in each share's name with its age band:
* group g covers ages 15+5(g-1) to 19+5(g-1); the last group is open-ended and
* is labelled 70_plus.
forvalues g = 1/12 {

	local lo = 15 + 5*(`g' - 1)
	local hi = `lo' + 4
	local band "`lo'_`hi'"
	if `g' == 12 {
		local band "70_plus"
	}

	rename edatt_less_than_primary`g' edatt_LTP_age_`band'
	rename edatt_primary`g'  edatt_primary_age_`band'
	rename edatt_secondary`g' edatt_secondary_age_`band'
	rename edatt_university`g' edatt_university_age_`band'

	forvalues yrs = 0/18 {
		rename edatt_yrs_`yrs'_`g' edatt_yrs_`yrs'_age_`band'
	}

}

* Multiply each age-band share by the matching age_* variable from the merged
* file, which turns shares into counts per age band.
* NOTE(review): age_15_19 ... age_70_plus are presumably population counts by
* age band from IC_EU_AGE.dta, stored in that order -- confirm.
foreach pop of varlist age_15_19 - age_70_plus {

	foreach stem in LTP primary secondary university {
		replace edatt_`stem'_`pop' = edatt_`stem'_`pop'*`pop'
	}

	forvalues yrs = 0/18 {
		replace edatt_yrs_`yrs'_`pop' = edatt_yrs_`yrs'_`pop'*`pop'
	}
}

* Add the age bands together to get one total per attainment level ...
egen edatt_less_than_primary = rowtotal(edatt_LTP*)
egen edatt_primary = rowtotal(edatt_primary*)
egen edatt_secondary = rowtotal(edatt_secondary*)
egen edatt_university = rowtotal(edatt_university*)

* ... and one total per years-of-school value; the per-band columns are
* dropped as soon as their total exists.
forvalues yrs = 0/18 {
	egen edatt_yrs_`yrs' = rowtotal(edatt_yrs_`yrs'_*)
	drop edatt_yrs_`yrs'_*
}

* Keep the region key and the education totals before combining files.
keep nuts edatt_less_than_primary-edatt_university edatt_yrs_*

* No total over the attainment levels is built here: the keep after the merge
* retains only the years-of-school variables, so such a total would be
* discarded immediately, and edatt_total is created from edatt_yrs* below.

* Combine with the census file saved earlier. -update- fills values that are
* missing in memory from the file on disk and leaves non-missing values as is.
merge 1:1 nuts using "$scratch/IC_EU_IPUMS.dta", nogen update

keep nuts edatt_yrs* EMP* home* hh_size
egen edatt_total = rsum(edatt_yrs*)

* Women as a percentage of all employed.
gen EMP_share_female = EMP_female / EMP *100

save "$dta_files/IC_EU_IPUMS.dta", replace

* Clean up: recompress the raw extract (relative to the working directory set
* at the top of the file) and delete the intermediate census file.
! compress ipumsi_00012.dat
erase  "$scratch/IC_EU_IPUMS.dta"
 

